Classification with tidymodels

Review

  • confusion matrix
  • accuracy
  • precision
  • recall/sensitivty

Exercise 1

Example 1 uses data about penguins from the Palmer Archipelago in Antarctica. The data include measurements about three different species of penguins. This example only considers two classes and does not use resampling methods because only one model is estimated.

library(tidyverse)
library(tidymodels)
library(palmerpenguins)

# drop to two species
penguins_small <- 
  bind_cols(
    penguins,
    random = runif(nrow(penguins))
  ) %>%
  mutate(
    species = 
      case_when(
        species == "Adelie" ~ "Adelie",
        species == "Gentoo" ~ "Gentoo",
        species == "Chinstrap" & random < 0.5 ~ "Adelie",
        species == "Chinstrap" & random > 0.5 ~ "Gentoo"
        )
    ) %>%
  mutate(species = factor(species)) %>%
  select(-random)

# look at missing data
map_dbl(.x = penguins_small, .f = ~ sum(is.na(.x)))
##           species            island    bill_length_mm     bill_depth_mm 
##                 0                 0                 2                 2 
## flipper_length_mm       body_mass_g               sex              year 
##                 2                 2                11                 0
# drop missing values
penguins_small <- penguins_small %>%
  filter(complete.cases(.))

Step 1. Split the data into training and testing data

set.seed(20201013)

# create a split object
penguins_small_split <- initial_split(data = penguins_small, prop = 0.8)

# create the training and testing data
penguins_small_train <- training(x = penguins_small_split) 
penguins_small_test <- testing(x = penguins_small_split)

rm(penguins_small)

Step 2. EDA

penguins_small_train %>%
  ggplot(aes(x = flipper_length_mm, y = bill_length_mm, color = species)) +
  geom_point() +
  theme_minimal()

Step 3. Create resamples

set.seed(20201217)
folds <- vfold_cv(data = penguins_small_train, v = 10)

Step 4. Create a model specification

knn_recipe <- 
  recipe(formula = species ~ ., data = penguins_small_train) %>%
  step_range(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g)

knn_mod <- 
  nearest_neighbor(neighbors = tune()) %>%
  set_engine(engine = "kknn") %>%
  set_mode(mode = "classification")

knn_workflow <- 
  workflow() %>% 
  add_model(spec = knn_mod) %>% 
  add_recipe(recipe = knn_recipe)

Step 5. Estimate the models

knn_grid <- tibble(neighbors = seq(from = 1, to = 15, by = 2))

knn_res <-  
  knn_workflow %>% 
  tune_grid(resamples = folds,
            grid = knn_grid)

knn_res %>%
  collect_metrics()
## # A tibble: 16 x 7
##    neighbors .metric  .estimator  mean     n std_err .config             
##        <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <fct>               
##  1         1 accuracy binary     0.899    10  0.0207 Preprocessor1_Model1
##  2         1 roc_auc  binary     0.907    10  0.0203 Preprocessor1_Model1
##  3         3 accuracy binary     0.899    10  0.0207 Preprocessor1_Model2
##  4         3 roc_auc  binary     0.951    10  0.0195 Preprocessor1_Model2
##  5         5 accuracy binary     0.907    10  0.0215 Preprocessor1_Model3
##  6         5 roc_auc  binary     0.969    10  0.0180 Preprocessor1_Model3
##  7         7 accuracy binary     0.907    10  0.0215 Preprocessor1_Model4
##  8         7 roc_auc  binary     0.972    10  0.0163 Preprocessor1_Model4
##  9         9 accuracy binary     0.903    10  0.0223 Preprocessor1_Model5
## 10         9 roc_auc  binary     0.971    10  0.0163 Preprocessor1_Model5
## 11        11 accuracy binary     0.910    10  0.0185 Preprocessor1_Model6
## 12        11 roc_auc  binary     0.976    10  0.0144 Preprocessor1_Model6
## 13        13 accuracy binary     0.910    10  0.0185 Preprocessor1_Model7
## 14        13 roc_auc  binary     0.976    10  0.0145 Preprocessor1_Model7
## 15        15 accuracy binary     0.907    10  0.0185 Preprocessor1_Model8
## 16        15 roc_auc  binary     0.976    10  0.0145 Preprocessor1_Model8

Exercise 2

We’re going to repeat exercise 1 with a CART model instead of KNN.

# create a cart model object
cart_mod <- 
  decision_tree() %>%
  set_engine(engine = "rpart") %>%
  set_mode(mode = "classification")

cart_workflow <- 
  workflow() %>% 
  add_model(spec = cart_mod) %>% 
  add_recipe(recipe = knn_recipe)

cart_res <-  
  cart_workflow %>% 
  tune_grid(resamples = folds,
            grid = knn_grid)

cart_res %>%
  collect_metrics()
## # A tibble: 2 x 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <fct>               
## 1 accuracy binary     0.870    10  0.0281 Preprocessor1_Model1
## 2 roc_auc  binary     0.937    10  0.0196 Preprocessor1_Model1

Exercise 3

We compared CART and KNN across the resamples. Let’s estimate CART on all of the training data.

Step 1. Estimate on all of the data

final_mod <- cart_mod %>%
  fit(formula = species ~ ., data = penguins_small_train)

Step 2. Make predictions on the testing data

# predict the predicted class and the predicted probability of each class
predictions <- bind_cols(
  penguins_small_test,
  predict(object = final_mod, new_data = penguins_small_test),
  predict(object = final_mod, new_data = penguins_small_test, type = "prob")
)

select(predictions, species, starts_with(".pred")) %>%
  sample_n(10)
## # A tibble: 10 x 4
##    species .pred_class .pred_Adelie .pred_Gentoo
##    <fct>   <fct>              <dbl>        <dbl>
##  1 Adelie  Adelie            0.981        0.0185
##  2 Gentoo  Gentoo            0.0106       0.989 
##  3 Gentoo  Gentoo            0.0106       0.989 
##  4 Gentoo  Adelie            0.981        0.0185
##  5 Gentoo  Adelie            0.655        0.345 
##  6 Gentoo  Gentoo            0.0106       0.989 
##  7 Gentoo  Gentoo            0.0106       0.989 
##  8 Adelie  Adelie            0.981        0.0185
##  9 Gentoo  Gentoo            0.0106       0.989 
## 10 Adelie  Adelie            0.981        0.0185

Step 3. Look at the model

rpart.plot::rpart.plot(x = final_mod$fit)

Step 4. Evaluate the model

Create a confusion matrix:

conf_mat(data = predictions,
         truth = species,
         estimate = .pred_class)
##           Truth
## Prediction Adelie Gentoo
##     Adelie     31      5
##     Gentoo      5     25

“Adelie” is the “event”.

  1. Calculate the accuracy
  2. Calculate the precision
  3. Calculate the sensitivity

Answers

  1. Calculate the accuracy

\[Accuracy = \frac{TP + TN}{total} = \frac{32 + 27}{66} = \frac{59}{66} \approx 0.894\]

accuracy(data = predictions,
         truth = species,
         estimate = .pred_class)
## # A tibble: 1 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.848
  1. Calculate the precision

\[Precision = \frac{TP}{TP + FP} = \frac{32}{32 + 4} = \frac{32}{36} \approx 0.889\]

precision(data = predictions,
          truth = species,
          estimate = .pred_class)
## # A tibble: 1 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 precision binary         0.861
  1. Calculate the recall/sensitivity

\[Sensitivity = \frac{32}{32 + 3} = \frac{32}{35} \approx 0.914\]

recall(data = predictions,
       truth = species,
       estimate = .pred_class)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 recall  binary         0.861

Step 6. Make a New Prediction

Photo by: Lescroël, A. L.; Ballard, G.; Grémillet, D.; Authier, M.; Ainley, D. G. (2014)

new_penguins <- tribble(
  ~island, ~bill_length_mm, ~bill_depth_mm, ~flipper_length_mm, ~body_mass_g, ~sex, ~year,
  "Torgersen", 40, 19, 190, 4000, "male", 2008
)

predict(object = final_mod, new_data = new_penguins)
## # A tibble: 1 x 1
##   .pred_class
##   <fct>      
## 1 Adelie
predict(object = final_mod, new_data = new_penguins, type = "prob")
## # A tibble: 1 x 2
##   .pred_Adelie .pred_Gentoo
##          <dbl>        <dbl>
## 1        0.981       0.0185

Bonus

Variable Importance

library(vip)

final_mod %>% 
  vip(num_features = 10)